001 /* 002 * CrochemoreLandauZivUkelsonGlobalAlignment.java 003 * 004 * Copyright 2003 Sergio Anibal de Carvalho Junior 005 * 006 * This file is part of NeoBio. 007 * 008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of 009 * the GNU General Public License as published by the Free Software Foundation; either 010 * version 2 of the License, or (at your option) any later version. 011 * 012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 014 * PURPOSE. See the GNU General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License along with NeoBio; 017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, 018 * Boston, MA 02111-1307, USA. 019 * 020 * Proper attribution of the author as the source of the software would be appreciated. 021 * 022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net 023 * Department of Computer Science http://www.dcs.kcl.ac.uk 024 * King's College London, UK http://www.kcl.ac.uk 025 * 026 * Please visit http://neobio.sourceforge.net 027 * 028 * This project was supervised by Professor Maxime Crochemore. 029 * 030 */ 031 032 package neobio.alignment; 033 034 /** 035 * This class implements the <B>global</B> pairwise sequence alignment algorithm (with 036 * linear gap penalty function) due to Maxime Crochemore, Gad Landau and Michal 037 * Ziv-Ukelson (2002). 038 * 039 * <P>This implementation derives from the paper of M.Crochemore, G.Landau and 040 * M.Ziv-Ukelson, <I>A Sub-quadratic Sequence Alignment Algorithm for Unrestricted Scoring 041 * Matrices</I> (available here as 042 * <A HREF="doc-files/Crochemore_Landau_Ziv-Ukelson_algorithm.pdf">PDF</A> or 043 * <A HREF="doc-files/Crochemore_Landau_Ziv-Ukelson_algorithm.pdf">Postscript</A>).</P> 044 * 045 * <P>For a general description of the algorithm, please refer to the specification of the 046 * abstract {@linkplain CrochemoreLandauZivUkelson} superclass.</P> 047 * 048 * <P>This class consist mainly of methods that:</P> 049 * 050 * <LU> 051 * <LI>create and compute all information of a block (see {@link #createBlock createBlock} 052 * and its variants); 053 * <LI>compute the output border of a block (see {@link #computeOutputBorder 054 * computeOutputBorder}; 055 * <LI>locate the score of a high scoring global alignment in the block table (see {@link 056 * #locateScore locateScore}; 057 * <LI>build an optimal global alignment from the information stored in the block table 058 * (see {@link #buildOptimalAlignment buildOptimalAlignment}. 059 * </LU> 060 * 061 * @see CrochemoreLandauZivUkelson 062 * @see CrochemoreLandauZivUkelsonLocalAlignment 063 * @author Sergio A. de Carvalho Jr. 064 */ 065 public class CrochemoreLandauZivUkelsonGlobalAlignment extends CrochemoreLandauZivUkelson 066 { 067 /** 068 * Creates and computes all information of an alignment block. Its main job is to 069 * compute the DIST column for the block. It then request the 070 * <CODE>computeOutputBorder</CODE> method to compute the block's output border. 071 * 072 * @param factor1 factor of the first sequence 073 * @param factor2 factor of the second sequence 074 * @param row row index of the block in the block table 075 * @param col column index of the block in the block table 076 * @return the computed block 077 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible 078 * with the sequences being aligned 079 */ 080 protected AlignmentBlock createBlock (Factor factor1, Factor factor2, int row, 081 int col) throws IncompatibleScoringSchemeException 082 { 083 AlignmentBlock block, left_prefix, diag_prefix, top_prefix; 084 int size, lr, lc, score_ins, score_sub, score_del, ins, del, sub, max; 085 086 lr = factor1.length(); 087 lc = factor2.length(); 088 size = lr + lc + 1; 089 090 block = new AlignmentBlock (factor1, factor2, size); 091 092 // set up pointers to prefixes 093 left_prefix = getLeftPrefix (block); 094 diag_prefix = getDiagonalPrefix (block); 095 top_prefix = getTopPrefix (block); 096 097 // compute scores 098 score_ins = scoreInsertion (factor2.getNewChar()); 099 score_sub = scoreSubstitution (factor1.getNewChar(), factor2.getNewChar()); 100 score_del = scoreDeletion (factor1.getNewChar()); 101 102 // compute dist column and direction 103 for (int i = 0; i < size; i++) 104 { 105 // compute optimal path to 106 // input border's ith position 107 108 ins = sub = del = Integer.MIN_VALUE; 109 110 if (i < size - 1) 111 ins = left_prefix.dist_column[i] + score_ins; 112 113 if ((i > 0) && (i < size - 1)) 114 sub = diag_prefix.dist_column[i - 1] + score_sub; 115 116 if (i > 0) 117 del = top_prefix.dist_column[i - 1] + score_del; 118 119 block.dist_column[i] = max = max (ins, sub, del); 120 121 // record the direction to of the optimal 122 // path to input border's ith position 123 if (max == ins) 124 block.direction[i] = LEFT_DIRECTION; 125 else if (max == sub) 126 block.direction[i] = DIAGONAL_DIRECTION; 127 else 128 block.direction[i] = TOP_DIRECTION; 129 } 130 131 computeOutputBorder (block, row, col, size, lc, lr); 132 133 return block; 134 } 135 136 /** 137 * Creates the root block. This is a special case of the <CODE>createBlock</CODE> 138 * method. No information is actually computed. 139 * 140 * @param factor1 factor of the first sequence 141 * @param factor2 factor of the second sequence 142 * @return the root block 143 */ 144 protected AlignmentBlock createRootBlock (Factor factor1, Factor factor2) 145 { 146 return new AlignmentBlock (factor1, factor2); 147 } 148 149 /** 150 * Creates and computes all information of an alignment block of the first row of the 151 * block table. This is a special case of the <CODE>createBlock</CODE> method. 152 * 153 * @param factor1 factor of the first sequence 154 * @param factor2 factor of the second sequence 155 * @param col column index of the block in the block table 156 * @return the computed block 157 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible 158 * with the sequences being aligned 159 * @see #createBlock createBlock 160 */ 161 protected AlignmentBlock createFirstRowBlock (Factor factor1, Factor factor2, int col) 162 throws IncompatibleScoringSchemeException 163 { 164 AlignmentBlock block, left_prefix; 165 int size, lr, lc, score_ins; 166 167 lr = 0; // factor1.length(); 168 lc = factor2.length(); 169 size = lr + lc + 1; 170 171 block = new AlignmentBlock (factor1, factor2, size); 172 173 // set up pointer to left prefix 174 left_prefix = getLeftPrefix (block); 175 176 // compute insertion's score 177 score_ins = scoreInsertion (factor2.getNewChar()); 178 179 // compute dist column and direction 180 for (int i = 0; i < lc; i++) 181 { 182 block.dist_column[i] = left_prefix.dist_column[i] + score_ins; 183 block.direction[i] = LEFT_DIRECTION; 184 } 185 186 // last position 187 block.dist_column[lc] = 0; 188 block.direction[lc] = STOP_DIRECTION; 189 190 computeOutputBorder (block, 0, col, size, lc, lr); 191 192 return block; 193 } 194 195 /** 196 * Creates and computes all information of an alignment block of the first column of 197 * the block table. This is a special case of the <CODE>createBlock</CODE> method. 198 * 199 * @param factor1 factor of the first sequence 200 * @param factor2 factor of the second sequence 201 * @param row row index of the block in the block table 202 * @return the computed block 203 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible 204 * with the sequences being aligned 205 * @see #createBlock createBlock 206 */ 207 protected AlignmentBlock createFirstColumnBlock (Factor factor1, Factor factor2, 208 int row) throws IncompatibleScoringSchemeException 209 { 210 AlignmentBlock block, top_prefix; 211 int size, lr, lc, score_del; 212 213 lr = factor1.length(); 214 lc = 0; // factor2.length(); 215 size = lr + lc + 1; 216 217 block = new AlignmentBlock (factor1, factor2, size); 218 219 // set up pointer to top prefix 220 top_prefix = getTopPrefix (block); 221 222 // compute deletion's score 223 score_del = scoreDeletion (factor1.getNewChar()); 224 225 // first position 226 block.dist_column[0] = 0; 227 block.direction[0] = STOP_DIRECTION; 228 229 // compute dist column and direction 230 for (int i = 1; i < size; i++) 231 { 232 block.dist_column[i] = top_prefix.dist_column[i - 1] + score_del; 233 block.direction[i] = TOP_DIRECTION; 234 } 235 236 computeOutputBorder (block, row, 0, size, lc, lr); 237 238 return block; 239 } 240 241 /** 242 * Computes the output border of a block. This is performed in five steps: 243 * 244 * <LU> 245 * <LI>Retrieve the block's input border; 246 * <LI>Retrieve the block's complete DIST matrix; 247 * <LI>Create an interface to the {@linkplain OutMatrix OUT} matrix from the input 248 * border and DIST matrix; 249 * <LI>Use {@linkplain Smawk SMAWK} to compute all column maxima of the OUT matrix 250 * (SMAWK finds the index of the row that contains the maximum value of a column); 251 * <LI>Assemble the output border by extracting the maximum values of each column of 252 * the OUT matrix using the information obtained in the previous step. 253 * </LU> 254 * 255 * @param block the block for which the output border is to be computed 256 * @param row row index of the block in the block table 257 * @param col column index of the block in the block table 258 * @param dim dimension of the output border 259 * @param lc number of columns of the block 260 * @param lr number of row of the block 261 */ 262 protected void computeOutputBorder (AlignmentBlock block, int row, int col, int dim, 263 int lc, int lr) 264 { 265 int[] input = assembleInputBorder (dim, row, col, lr); 266 267 int[][] dist = assembleDistMatrix (block, dim, row, col, lc); 268 269 // update the interface to the OUT matrix 270 out_matrix.setData (dist, input, dim, lc); 271 272 // compute source_path using Smawk 273 smawk.computeColumnMaxima (out_matrix, block.source_path); 274 275 // update output border 276 for (int i = 0; i < dim; i++) 277 block.output_border[i] = out_matrix.valueAt(block.source_path[i], i); 278 } 279 280 /** 281 * Builds an optimal global alignment between the loaded sequences after the block 282 * table has been computed. This method traces a path back in the block table, from 283 * the last block to the first. 284 * 285 * @return an optimal global alignment 286 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible 287 * with the loaded sequences. 288 * @see CrochemoreLandauZivUkelson#traverseBlock 289 */ 290 protected PairwiseAlignment buildOptimalAlignment () 291 throws IncompatibleScoringSchemeException 292 { 293 AlignmentBlock block, ancestor; 294 StringBuffer gapped_seq1, tag_line, gapped_seq2; 295 int source, dest, ancestor_source; 296 int row, col; 297 298 gapped_seq1 = new StringBuffer(); 299 tag_line = new StringBuffer(); 300 gapped_seq2 = new StringBuffer(); 301 302 // start at the last row, last column of block table 303 row = num_rows - 1; col = num_cols - 1; 304 block = block_table[row][col]; 305 dest = block.factor2.length(); 306 307 while (row > 0 || col > 0) 308 { 309 block = block_table[row][col]; 310 source = block.source_path[dest]; 311 ancestor = block.ancestor[dest]; 312 313 ancestor_source = source; 314 if (dest > block.factor2.length()) 315 ancestor_source -= (block.factor1.length() - ancestor.factor1.length()); 316 317 traverseBlock (ancestor, ancestor_source, gapped_seq1, tag_line, gapped_seq2); 318 319 if (row == 0) 320 { 321 col = col - 1; 322 dest = block_table[row][col].factor2.length(); 323 } 324 else if (col == 0) 325 { 326 row = row - 1; 327 dest = 0; 328 } 329 else 330 { 331 if (source < block.factor1.length()) 332 { 333 col = col - 1; 334 dest = block_table[row][col].factor2.length() + source; 335 } 336 else if (source == block.factor1.length()) 337 { 338 row = row - 1; col = col - 1; 339 dest = block_table[row][col].factor2.length(); 340 } 341 else 342 { 343 row = row - 1; 344 dest = source - block.factor1.length(); 345 } 346 } 347 } 348 349 return new PairwiseAlignment (gapped_seq1.toString(), tag_line.toString(), 350 gapped_seq2.toString(), locateScore()); 351 } 352 353 /** 354 * Locate the score of the highest scoring global alignment in the block table. This 355 * value is found in the output border of the last block (last row, last column). 356 * 357 * @return the score of the highest scoring global alignment 358 */ 359 protected int locateScore () 360 { 361 AlignmentBlock last_block = block_table[num_rows - 1][num_cols - 1]; 362 363 return last_block.output_border[last_block.factor2.length()]; 364 } 365 }